Intestine development by Huycke et al.
Basic setup¶
In [1]:
Copied!
# Auto-reload edited Python modules so changes to the Concord package are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# --- duplicate of the cell above (website "Copied!" export artifact) ---
%load_ext autoreload
%autoreload 2
In [2]:
Copied!
# Setup: load the Huycke et al. intestine development AnnData, keep the raw
# counts, then normalize and log-transform for downstream analysis.
import Concord as ccd
import scanpy as sc
import torch
import warnings
warnings.filterwarnings('ignore')
# Path to the processed AnnData file, relative to the notebook directory.
data_path = "../data/intestine_dev/intestine_adata.h5ad"
adata = sc.read(
data_path
)
# Preserve the raw counts before normalization so they stay recoverable.
adata.layers["counts"] = adata.X.copy()
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
# Score cell-cycle phase; organism='Mm' maps the human gene lists to mouse
# orthologs (see the ortholog-mapping log line below).
ccd.ul.score_cell_cycle(adata, organism='Mm')
# --- duplicate of the cell above (website "Copied!" export artifact) ---
import Concord as ccd
import scanpy as sc
import torch
import warnings
warnings.filterwarnings('ignore')
data_path = "../data/intestine_dev/intestine_adata.h5ad"
adata = sc.read(
data_path
)
adata.layers["counts"] = adata.X.copy()
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
ccd.ul.score_cell_cycle(adata, organism='Mm')
Concord - INFO - Processed 43 human genes to mouse orthologs. Concord - INFO - Processed 54 human genes to mouse orthologs.
In [3]:
Copied!
# Project bookkeeping: dated save directory, compute device, and a file
# suffix so every artifact from this run is grouped and timestamped together.
import time
from pathlib import Path

proj_name = "concord_Huycke_intestine"
# Month+day stamp (e.g. "Jan05") shared by the save directory and file suffix.
date_stamp = time.strftime('%b%d')
save_dir = Path(f"../save/dev_{proj_name}-{date_stamp}/")
save_dir.mkdir(parents=True, exist_ok=True)
# Prefer the first CUDA GPU when available, otherwise fall back to CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
file_suffix = f"{proj_name}_{date_stamp}"
seed = 0
# --- duplicate of the cell above (website "Copied!" export artifact) ---
import time
from pathlib import Path

proj_name = "concord_Huycke_intestine"
date_stamp = time.strftime('%b%d')
save_dir = Path(f"../save/dev_{proj_name}-{date_stamp}/")
save_dir.mkdir(parents=True, exist_ok=True)
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
file_suffix = f"{proj_name}_{date_stamp}"
seed = 0
Define feature input and estimate dropout rate in data¶
Dropout is one of the major patterns in single-cell data. Even nearby cells can have a significant proportion of non-overlapping non-zero genes, which could be due to technical dropout or real biology (genes stochastically turned on or off). Concord 'augments' the data by introducing dropout to each cell, and uses contrastive learning to 'self-supervise' the model to learn cell identity despite the presence of dropout. The augmentation dropout probability can be custom-defined, or estimated from the average dropout rate in the current data. Here we provide a detailed example of how this is calculated:
In [103]:
Copied!
# Select top variably expressed/accessible features for analysis (other methods besides seurat_v3 available)
output_key = 'Concord'  # obsm key where the Concord latent embedding will be stored
feature_list = ccd.ul.select_features(adata, n_top_features=10000, flavor='seurat_v3')
# Determine augmentation mask probabilities based on estimated dropout rate from knn neighbors
# return_mean=False — presumably returns a per-cell estimate rather than a
# single scalar (it is stored per-cell in adata.obs later); TODO confirm.
dropout_est = ccd.ul.estimate_aug_mask_prob(adata, input_feature = feature_list, n_samples = adata.n_obs, return_mean=False, plotting=True)
# --- duplicate of the cell above (website "Copied!" export artifact) ---
# Select top variably expressed/accessible features for analysis (other methods besides seurat_v3 available)
output_key = 'Concord'
feature_list = ccd.ul.select_features(adata, n_top_features=10000, flavor='seurat_v3')
# Determine augmentation mask probabilities based on estimated dropout rate from knn neighbors
dropout_est = ccd.ul.estimate_aug_mask_prob(adata, input_feature = feature_list, n_samples = adata.n_obs, return_mean=False, plotting=True)
Concord.utils.feature_selector - INFO - Selecting highly variable features with flavor seurat_v3...
In [86]:
Copied!
# Save for concord run and visualization
adata.obs['dropout_est'] = dropout_est
aug_prob = dropout_est.mean()
# Save for concord run and visualization
adata.obs['dropout_est'] = dropout_est
aug_prob = dropout_est.mean()
Here's one specific example showing the gene set difference between a cell and its 3 nearest neighbors.
In [73]:
Copied!
# Worked example: compare one cell to its 3 nearest neighbors in PCA space to
# illustrate how different their non-zero gene sets are.
from Concord.model.knn import Neighborhood
import numpy as np
# Build the kNN index on the existing PCA embedding.
emb = adata.obsm['X_pca']
neighborhood = Neighborhood(emb=emb, k=3)
core_samples = np.array([9000])  # arbitrary example cell index
X = adata.X.toarray()
# 'drop_diff' — presumably measures the difference in zero/non-zero gene
# status between a cell and its neighbors; TODO confirm in the Concord docs.
avg_distances = neighborhood.average_knn_distance(core_samples, X, k=3, distance_metric='drop_diff')
print(avg_distances)
indices = neighborhood.get_knn_indices(core_samples, k=3, include_self=False)
# Show the cell types of the query cell and of its neighbors.
print(adata.obs['cell_type'].iloc[core_samples])
print(adata.obs['cell_type'].iloc[indices.flatten()])
# --- duplicate of the cell above (website "Copied!" export artifact) ---
from Concord.model.knn import Neighborhood
import numpy as np
emb = adata.obsm['X_pca']
neighborhood = Neighborhood(emb=emb, k=3)
core_samples = np.array([9000])
X = adata.X.toarray()
avg_distances = neighborhood.average_knn_distance(core_samples, X, k=3, distance_metric='drop_diff')
print(avg_distances)
indices = neighborhood.get_knn_indices(core_samples, k=3, include_self=False)
print(adata.obs['cell_type'].iloc[core_samples])
print(adata.obs['cell_type'].iloc[indices.flatten()])
Concord.model.knn - INFO - Using FAISS CPU index. Concord.model.knn - INFO - Building Faiss FlatL2 index. Concord.model.knn - INFO - Using FAISS CPU index.
Out[73]:
array([0.4884242])
In [81]:
Copied!
# Heatmap of the query cell (first row) and its 3 neighbors: genes are
# clustered (columns), cells kept in order (rows).
import seaborn as sns
import sys
# Raise the recursion limit: hierarchical clustering over ~all genes in
# clustermap can recurse deeply.
sys.setrecursionlimit(10000)
mtx = adata.X.toarray()
# NOTE(review): core_nonzero / neighbor_nonzero are computed but never used
# below — either drop them or use them to quantify the gene-set difference.
core_nonzero = (mtx[core_samples] > 0) # Boolean array for non-zero genes in core cells
neighbor_nonzero = (mtx[indices] > 0) # Boolean array for non-zero genes in neighbors
# Stack the query cell on top of its neighbors for plotting.
cbn_indices = np.concatenate([core_samples, indices.flatten()])
sns.clustermap(mtx[cbn_indices], row_cluster=False, col_cluster=True, cmap='viridis', figsize=(7, 3))
# --- duplicate of the cell above (website "Copied!" export artifact) ---
import seaborn as sns
import sys
sys.setrecursionlimit(10000)
mtx = adata.X.toarray()
core_nonzero = (mtx[core_samples] > 0) # Boolean array for non-zero genes in core cells
neighbor_nonzero = (mtx[indices] > 0) # Boolean array for non-zero genes in neighbors
cbn_indices = np.concatenate([core_samples, indices.flatten()])
sns.clustermap(mtx[cbn_indices], row_cluster=False, col_cluster=True, cmap='viridis', figsize=(7, 3))
Out[81]:
<seaborn.matrix.ClusterGrid at 0x7f22879c9c40>
Run Concord¶
In [91]:
Copied!
# Configure and run Concord on the selected features, using the sequencing
# lane ('LaneID') as the batch/domain variable.
cur_ccd = ccd.Concord(adata=adata, input_feature=feature_list, domain_key='LaneID',
latent_dim=32,
augmentation_mask_prob = aug_prob,  # estimated from the data's dropout rate above
clr_temperature = .3, # Check out advanced usage to learn what this parameter controls
min_p_intra_domain=1.0,  # NOTE(review): run log below shows p_intra_domain pinned to 1.00 for every lane — confirm intended
seed=seed,
inplace=False,  # work on an internal copy; results are copied back to `adata` below
verbose=True,
device=device)
# Encode data, saving the latent embedding in adata.obsm['Concord']
cur_ccd.encode_adata(input_layer_key='X_log1p', output_key=output_key)
# Save the latent embedding to a file, so that it can be loaded later
ccd.ul.save_obsm_to_hdf5(cur_ccd.adata, save_dir / f"obsm_{file_suffix}.h5")
adata.obsm = cur_ccd.adata.obsm # If not inplace, copy obsm (incl. the new embedding) back onto adata
# --- duplicate of the cell above (website "Copied!" export artifact) ---
cur_ccd = ccd.Concord(adata=adata, input_feature=feature_list, domain_key='LaneID',
latent_dim=32,
augmentation_mask_prob = aug_prob,
clr_temperature = .3, # Check out advanced usage to learn what this parameter controls
min_p_intra_domain=1.0,
seed=seed,
inplace=False,
verbose=True,
device=device)
# Encode data, saving the latent embedding in adata.obsm['Concord']
cur_ccd.encode_adata(input_layer_key='X_log1p', output_key=output_key)
# Save the latent embedding to a file, so that it can be loaded later
ccd.ul.save_obsm_to_hdf5(cur_ccd.adata, save_dir / f"obsm_{file_suffix}.h5")
adata.obsm = cur_ccd.adata.obsm # If not inplace
Concord - INFO - Setting sampler_knn to 1309 to be 1/50 the number of cells in the dataset. You can change this value by setting sampler_knn in the configuration. Concord - INFO - Column 'LaneID' is already of type: category Concord - INFO - Unused levels dropped for column 'LaneID'. Concord - INFO - Encoder input dim: 10000 Concord - INFO - Decoder input dim: 40 Concord - INFO - Model loaded to device: cuda:0 Concord - INFO - Total number of parameters: 2590128 Concord.model.dataloader - INFO - Preprocessing adata... Concord.utils.preprocessor - INFO - Data is already log1p transformed. Skip normalization. Concord.utils.preprocessor - INFO - Data is already log1p transformed. Storing in the specified layer. Concord.utils.preprocessor - INFO - Filtering features with provided list (10000 features)... Concord.model.anndataset - INFO - Initialized dataset with 65468 samples. Data structure: ['input', 'domain', 'idx'] Concord.model.dataloader - INFO - Using existing embedding 'X_pca' from adata.obsm Concord.model.knn - INFO - Using FAISS CPU index. Concord.model.knn - INFO - Building Faiss IVF index. nprobe=10 Concord.model.knn - INFO - Using FAISS CPU index. Concord.model.dataloader - INFO - Number of unique_domains: 6 Concord.model.dataloader - INFO - Calculating each domain's coverage of the global manifold using X_pca. Concord.model.dataloader - INFO - Converting coverage to p_intra_domain... Concord.model.dataloader - INFO - Final p_intra_domain values: L1: 1.00, L2: 1.00, R: 1.00, Live_1: 1.00, Live_2: 1.00, Rare: 1.00 Concord - INFO - Starting epoch 1/5 Concord - INFO - Processing chunk 1/1 for epoch 1 Concord - INFO - Number of samples in train_dataloader: 65468
Epoch 0 Training: 1020it [00:32, 31.28it/s, loss=3.05]
Concord - INFO - Epoch 0 | Train Loss: 3.33, MSE: 0.09, CLASS: 0.00, CONTRAST: 3.24, IMPORTANCE: 0.00 Concord - INFO - Starting epoch 2/5 Concord - INFO - Processing chunk 1/1 for epoch 2 Concord - INFO - Number of samples in train_dataloader: 65468
Epoch 1 Training: 100%|██████████| 1020/1020 [00:30<00:00, 33.47it/s, loss=3.06]
Concord - INFO - Epoch 1 | Train Loss: 2.99, MSE: 0.06, CLASS: 0.00, CONTRAST: 2.93, IMPORTANCE: 0.00 Concord - INFO - Starting epoch 3/5 Concord - INFO - Processing chunk 1/1 for epoch 3 Concord - INFO - Number of samples in train_dataloader: 65468
Epoch 2 Training: 100%|██████████| 1020/1020 [00:27<00:00, 36.61it/s, loss=2.94]
Concord - INFO - Epoch 2 | Train Loss: 2.94, MSE: 0.06, CLASS: 0.00, CONTRAST: 2.88, IMPORTANCE: 0.00 Concord - INFO - Starting epoch 4/5 Concord - INFO - Processing chunk 1/1 for epoch 4 Concord - INFO - Number of samples in train_dataloader: 65468
Epoch 3 Training: 100%|██████████| 1020/1020 [00:28<00:00, 36.01it/s, loss=3.09]
Concord - INFO - Epoch 3 | Train Loss: 2.91, MSE: 0.06, CLASS: 0.00, CONTRAST: 2.85, IMPORTANCE: 0.00 Concord - INFO - Starting epoch 5/5 Concord - INFO - Processing chunk 1/1 for epoch 5 Concord - INFO - Number of samples in train_dataloader: 65468
Epoch 4 Training: 100%|██████████| 1020/1020 [00:29<00:00, 34.81it/s, loss=3.05]
Concord - INFO - Epoch 4 | Train Loss: 2.89, MSE: 0.06, CLASS: 0.00, CONTRAST: 2.83, IMPORTANCE: 0.00
Concord - INFO - Model saved to save/final_model.pth Concord - INFO - Final model saved at: save/final_model.pth; Configuration saved at: save/config.json. Concord.model.dataloader - INFO - Preprocessing adata... Concord.utils.preprocessor - INFO - Data is already log1p transformed. Skip normalization. Concord.utils.preprocessor - INFO - Data is already log1p transformed. Storing in the specified layer. Concord.utils.preprocessor - INFO - Filtering features with provided list (10000 features)... Concord.model.anndataset - INFO - Initialized dataset with 65468 samples. Data structure: ['input', 'domain', 'idx'] Concord - INFO - Predicting for chunk 1/1
Visualize Concord latent with UMAP¶
2D UMAP¶
In [100]:
Copied!
# NOTE(review): the UMAP computation is commented out, so adata.obsm must
# already contain f'{output_key}_UMAP' (e.g. computed earlier or loaded from
# the saved obsm file) or this cell will fail.
#ccd.ul.run_umap(adata, source_key=output_key, umap_key=f'{output_key}_UMAP', n_components=2, n_neighbors=30, min_dist=0.1, metric='euclidean', random_state=seed)
show_basis = f'{output_key}_UMAP'
# Metadata columns to color the embedding by (includes the dropout estimate).
show_cols = ['MouseAge_combined', 'seg_classify', 'phase', 'batch', 'cell_type', "mes_subtype", "dropout_est"]
ccd.pl.plot_embedding(
adata, show_basis, show_cols, figsize=(13,12), dpi=600, ncols=3, font_size=5, point_size=1, legend_loc='on data',
save_path=save_dir / f"{show_basis}_{file_suffix}.png"
)
# --- duplicate of the cell above (website "Copied!" export artifact) ---
#ccd.ul.run_umap(adata, source_key=output_key, umap_key=f'{output_key}_UMAP', n_components=2, n_neighbors=30, min_dist=0.1, metric='euclidean', random_state=seed)
show_basis = f'{output_key}_UMAP'
show_cols = ['MouseAge_combined', 'seg_classify', 'phase', 'batch', 'cell_type', "mes_subtype", "dropout_est"]
ccd.pl.plot_embedding(
adata, show_basis, show_cols, figsize=(13,12), dpi=600, ncols=3, font_size=5, point_size=1, legend_loc='on data',
save_path=save_dir / f"{show_basis}_{file_suffix}.png"
)